import nltk
from nltk.probability import LidstoneProbDist, LaplaceProbDist
from nltk.corpus import brown
from nltk import FreqDist, ConditionalFreqDist
import random
import jieba

# 示例文本 读取ylk.txt
text = open("kyj.txt", encoding="utf-8").read()
# jieba分词
tokens = jieba.cut(text)
# 生成bigrams
bi_grams = list(nltk.ngrams(tokens, 2))
# 创建条件频率分布对象
cfd = ConditionalFreqDist(bi_grams)

# # 使用Lidstone平滑
# # gamma值小于1的Lidstone平滑
lidstone_cfd = {condition: LidstoneProbDist(cfd[condition], gamma=0.1) for condition in cfd.conditions()}

# 使用Laplace平滑
# Laplace平滑是gamma=1的特殊情况
laplace_cfd = {condition: LaplaceProbDist(cfd[condition]) for condition in cfd.conditions()}

def generate_text(initial_word, cfd, num_words=50):
    current_word = initial_word
    generated_text = [current_word]

    for _ in range(num_words - 1):
        if current_word not in cfd:
            break
        next_word = random.choices(
            population=list(cfd[current_word].samples()),
            weights=[cfd[current_word].prob(w) for w in cfd[current_word].samples()]
        )[0]
        generated_text.append(next_word)
        current_word = next_word

    return ''.join(generated_text)

# 示例：从"哈哈"开始生成文本
print(generate_text('孔乙己', laplace_cfd, 20))

